package com.asiainfo.zqx;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.select.Elements;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.regex.Pattern;

/**
 * Crawler for Beijing News (新京报, bjnews.com.cn).
 *
 * <p>The crawl is seeded with the {@code /point} page. Pages that are not tagged with the
 * {@code "text"} type are scanned for links to article-detail pages, which are queued with the
 * {@code "text"} type. Pages tagged {@code "text"} have their URL, title, publish time, reporter
 * and body text printed to standard output.
 */
public class XinJingbao extends BreadthCrawler {
    /** Page the crawl starts from. */
    private static final String SEED_URL = "https://www.bjnews.com.cn/point";

    /**
     * Matches absolute article-detail URLs. Compiled once, and the dots of the host name and of
     * the {@code .html} suffix are escaped so that they only match a literal dot.
     */
    private static final Pattern DETAIL_URL =
            Pattern.compile("https://www\\.bjnews\\.com\\.cn/detail/.*\\.html");

    /** Format of the timestamp prefix expected in the {@code span.timer} element. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm";

    /** Number of leading characters of the timestamp text that are handed to the parser. */
    private static final int TIME_LENGTH = "2022-07-08 16:16".length();

    /**
     * Creates the crawler, registers the seed URL and limits fetching to a single thread.
     *
     * @param crawlPath crawl path forwarded to {@link BreadthCrawler}
     */
    public XinJingbao(String crawlPath) {
        // The second argument is false (presumably WebCollector's autoParse switch - confirm),
        // and visit() adds follow-up links itself.
        super(crawlPath, false);
        addSeed(SEED_URL);
        setThreads(1);
        // Resumable mode is intentionally left off; call setResumable(true) here to enable it.
    }

    /**
     * Handles one fetched page: prints the article fields for pages of type {@code "text"},
     * otherwise queues the article-detail links found on the page.
     *
     * @param page        the fetched page
     * @param crawlDatums receiver for the follow-up URLs to crawl
     * @throws RuntimeException if a {@code "text"} page carries a publish time that does not
     *                          match {@code yyyy-MM-dd HH:mm}
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        // Pages without a content type are skipped entirely.
        if (page.contentType() == null) {
            return;
        }
        if (page.matchType("text")) {
            printArticle(page);
        } else {
            collectDetailLinks(page, crawlDatums);
        }
    }

    /** Prints URL, title, parsed publish time, reporter and body text of an article page. */
    private void printArticle(Page page) {
        String url = page.url();
        String title = page.select("div.content>h1").text();
        String time = normalizeTime(page.select("span.timer").text());
        String writer = page.select("span.reporter").text();
        String text = page.select("div.content-name").text();

        System.out.println(url);
        System.out.println(title);
        // A fresh formatter per call: SimpleDateFormat instances must not be shared across threads.
        SimpleDateFormat sdf = new SimpleDateFormat(TIME_PATTERN);
        try {
            System.out.println(sdf.parse(time));
        } catch (ParseException e) {
            throw new RuntimeException(
                    "Unparseable publish time '" + time + "' on page " + url, e);
        }
        System.out.println(writer);
        System.out.println(text);
    }

    /** Queues every anchor target on the page that matches the article-detail URL pattern. */
    private void collectDetailLinks(Page page, CrawlDatums crawlDatums) {
        Elements anchors = page.select("a");
        for (int i = 0; i < anchors.size(); i++) {
            // "abs:href" resolves the link against the page's base URL.
            String href = anchors.get(i).attr("abs:href");
            if (DETAIL_URL.matcher(href).matches()) {
                crawlDatums.add(new CrawlDatum(href, "text"));
            }
        }
    }

    /**
     * Trims the raw timestamp text and cuts it down to the leading {@code yyyy-MM-dd HH:mm} part.
     *
     * <p>The length is checked after trimming, so surrounding whitespace can no longer make the
     * substring call run past the end of the trimmed value.
     */
    private static String normalizeTime(String raw) {
        String trimmed = raw.trim();
        return trimmed.length() > TIME_LENGTH ? trimmed.substring(0, TIME_LENGTH) : trimmed;
    }

    /**
     * Starts a crawl that stores its state under the crawl path {@code "xin"} and calls
     * {@code start(2)}.
     *
     * @param args unused
     * @throws Exception if the crawler fails to run
     */
    public static void main(String[] args) throws Exception {
        XinJingbao xin = new XinJingbao("xin");
        xin.start(2);
    }
}
